import torch
import numpy as np
import os
import sys
import pickle
import argparse
import matplotlib.pyplot as plt
from copy import deepcopy
from tqdm import tqdm
from datetime import datetime
from einops import rearrange

from constants import DT
from constants import PUPPET_GRIPPER_JOINT_OPEN
from utils import load_data, load_train_data, load_val_data # data functions
from utils import sample_box_pose, sample_insertion_pose # robot functions
from utils import compute_dict_mean, set_seed, detach_dict # helper functions
from policy import ACTPolicy, CNNMLPPolicy, HATACTPolicy
from visualize_episodes import save_videos

import wandb

from sim_env import BOX_POSE

import IPython
e = IPython.embed

def plot_cls_attention(cls_weights, cls_weights2, timestep, dir):
    """Plot the recorded CLS attention values of both arm decoders and save a PNG.

    For every recorded step only the entry at index 3 is drawn (one line per
    decoder). Which attention component index 3 corresponds to is defined by
    the policy that produced the weights -- TODO confirm against the policy.

    Args:
        cls_weights: List with one entry per recorded step; each entry is a
            sequence of tensors (anything supporting ``.cpu().numpy()``) for
            the first arm decoder.
        cls_weights2: Same structure for the second arm decoder.
        timestep: Used as the file name of the saved figure.
        dir: Existing directory the figure is written to (``{dir}/{timestep}.png``).

    Returns:
        None. When ``cls_weights`` is empty nothing is plotted or written.
    """
    # Policies that do not expose CLS attention never record anything;
    # indexing the first step below would otherwise raise IndexError.
    if not cls_weights:
        return

    cls_weights_cpu = [[weight.cpu().numpy() for weight in step] for step in cls_weights]
    cls_weights2_cpu = [[weight.cpu().numpy() for weight in step] for step in cls_weights2]

    plotted_index = 3

    plt.figure()
    if len(cls_weights_cpu[0]) > plotted_index:
        plt.plot([step[plotted_index] for step in cls_weights_cpu], label='Arm1 Decoder')
        plt.plot([step[plotted_index] for step in cls_weights2_cpu], label='Arm2 Decoder')
    plt.xlabel('Timestep')
    plt.ylabel('Attention Weight Sum')
    plt.xlim(0, 500)
    plt.ylim(0, 25)
    plt.title('Attention Weights for CLS tokens in Multi-arm Decoder')
    plt.legend()
    plt.savefig(f'{dir}/{timestep}.png')
    plt.close()

def main(args):
    """Run evaluation only, or training (optionally followed by evaluation).

    Args:
        args: Dict of run settings. Keys read here: ``eval``, ``train_eval``,
            ``wandb``, ``resume``, ``resume_ckpt_path``, ``project``, ``name``,
            ``notes``, ``ckpt_dir``, ``policy_class``, ``onscreen_render``,
            ``task_name``, ``batch_size``, ``num_epochs``, ``num_blocks``,
            ``dec_layers``, ``save_video``, ``lr``, ``chunk_size``,
            ``kl_weight``, ``hidden_dim``, ``dim_feedforward``, ``seed`` and
            ``temporal_agg``. The whole dict is also forwarded to ``train_bc``.

    Raises:
        NotImplementedError: If ``policy_class`` is not ACT, HATACT or CNNMLP.
        SystemExit: After evaluation when ``args['eval']`` is set.
    """
    import shutil  # only needed for the source snapshot below

    set_seed(1)
    if not args['eval'] and args['wandb']:
        if args['resume'] and args['resume_ckpt_path'] is not None:
            # Only the stored wandb run id is needed here, so keep the
            # checkpoint tensors on the CPU.
            checkpoint = torch.load(args['resume_ckpt_path'], map_location='cpu')
            run_id = checkpoint.get('run_id', None)
            wandb.init(
                entity="aloha",
                project=args['project'],
                group=args['task_name'],
                name=args['name'],
                notes=args['notes'],
                resume="must",
                id=run_id,
                config=args,
            )
        else:
            wandb.init(
                entity="aloha",
                project=args['project'],
                group=args['task_name'],
                name=args['name'],
                config=args,
            )
        wandb.run.save(sys.argv[1], policy='now')

    # command line parameters
    is_eval = args['eval']
    is_train_eval = args['train_eval']
    ckpt_dir = args['ckpt_dir']
    policy_class = args['policy_class']
    onscreen_render = args['onscreen_render']
    task_name = args['task_name']
    batch_size_train = args['batch_size']
    num_epochs = args['num_epochs']
    num_blocks = args['num_blocks']
    dec_layers = args['dec_layers']
    save_video = args['save_video']

    # get task parameters
    is_sim = task_name[:4] == 'sim_'
    if is_sim:
        from constants import SIM_TASK_CONFIGS
        task_config = SIM_TASK_CONFIGS[task_name]
    else:
        from constants import TASK_CONFIGS
        task_config = TASK_CONFIGS[task_name]
    dataset_dir = task_config['dataset_dir']
    num_episodes = task_config['num_episodes']
    episode_len = task_config['episode_len']
    camera_names = task_config['camera_names']

    # fixed parameters
    state_dim = 14
    lr_backbone = 1e-5
    backbone = 'resnet18'
    if policy_class in ('ACT', 'HATACT'):
        policy_config = {
            'lr': args['lr'],
            'num_queries': args['chunk_size'],
            'kl_weight': args['kl_weight'],
            'hidden_dim': args['hidden_dim'],
            'dim_feedforward': args['dim_feedforward'],
            'lr_backbone': lr_backbone,
            'backbone': backbone,
            'enc_layers': 4,
            # ACT always uses 7 decoder layers; HATACT takes the count from args.
            'dec_layers': 7 if policy_class == 'ACT' else dec_layers,
            'nheads': 8,
            'camera_names': camera_names,
        }
        if policy_class == 'HATACT':
            policy_config['num_blocks'] = num_blocks
    elif policy_class == 'CNNMLP':
        policy_config = {
            'lr': args['lr'],
            'lr_backbone': lr_backbone,
            'backbone': backbone,
            'num_queries': 1,
            'camera_names': camera_names,
        }
    else:
        raise NotImplementedError

    config = {
        'num_epochs': num_epochs,
        'ckpt_dir': ckpt_dir,
        'episode_len': episode_len,
        'state_dim': state_dim,
        'lr': args['lr'],
        'policy_class': policy_class,
        'onscreen_render': onscreen_render,
        'policy_config': policy_config,
        'task_name': task_name,
        'seed': args['seed'],
        'temporal_agg': args['temporal_agg'],
        'camera_names': camera_names,
        'real_robot': not is_sim,
    }

    if is_eval:
        ckpt_names = [os.path.join(ckpt_dir, 'weights/best/policy_best.ckpt')]
        results = []
        for ckpt_name in ckpt_names:
            success_rate, avg_return = eval_bc(config, ckpt_name, save_episode=save_video)
            results.append([ckpt_name, success_rate, avg_return])

        for ckpt_name, success_rate, avg_return in results:
            print(f'{ckpt_name}: {success_rate=} {avg_return=}')
        print()
        # sys.exit instead of the interactive-only ``exit`` helper from ``site``.
        sys.exit()

    train_dataloader, stats, _ = load_train_data(dataset_dir + '/train', 40, camera_names, batch_size_train)

    # Output layout under ckpt_dir; exist_ok keeps re-runs and resumes working.
    for sub_dir in ('', 'plots', 'weights', 'weights/best', 'videos', 'files'):
        os.makedirs(os.path.join(ckpt_dir, sub_dir), exist_ok=True)

    # save dataset stats
    stats_path = os.path.join(ckpt_dir, 'dataset_stats.pkl')
    with open(stats_path, 'wb') as f:
        pickle.dump(stats, f)

    # Snapshot the config and source files next to the run. This is best
    # effort: a missing file must not abort training.
    snapshot_dir = os.path.join(ckpt_dir, 'files')
    snapshot_sources = [
        (sys.argv[1], 'config.py'),
        ('train.py', 'train.py'),
        ('policy.py', 'policy.py'),
        ('detr/models/hatransformer.py', 'hatransformer.py'),
        ('detr/models/detr_vae.py', 'detr_vae.py'),
    ]
    for src_path, dst_name in snapshot_sources:
        try:
            shutil.copy(src_path, os.path.join(snapshot_dir, dst_name))
        except OSError as err:
            print(f'Warning: could not copy {src_path} to {snapshot_dir}: {err}')

    print(args)
    ckpt_paths = train_bc(train_dataloader, dataset_dir, num_episodes, camera_names, batch_size_train, config, args)

    if is_train_eval:
        results = []
        for ckpt_name in ckpt_paths:
            success_rate, avg_return = eval_bc(config, ckpt_name, save_episode=save_video)
            results.append([ckpt_name, success_rate, avg_return])

        for ckpt_name, success_rate, avg_return in results:
            print(f'{ckpt_name}: {success_rate=} {avg_return=}')

        # train_bc may return no checkpoints; max() on an empty list would raise.
        if results:
            ckpt_name, highest_success_rate, avg_return = max(results, key=lambda x: x[1])
            if args['wandb']:
                wandb.log({
                    "success_rate": highest_success_rate,
                    "avg_return": avg_return,
                })

    if args['wandb']:
        wandb.finish()


def make_policy(policy_class, policy_config):
    """Instantiate the policy selected by ``policy_class``.

    Args:
        policy_class: One of ``'ACT'``, ``'HATACT'`` or ``'CNNMLP'``.
        policy_config: Passed unchanged to the policy constructor.

    Returns:
        The constructed policy object.

    Raises:
        NotImplementedError: For any other ``policy_class``.
    """
    if policy_class == 'ACT':
        return ACTPolicy(policy_config)
    if policy_class == 'HATACT':
        return HATACTPolicy(policy_config)
    if policy_class == 'CNNMLP':
        return CNNMLPPolicy(policy_config)
    raise NotImplementedError


def make_optimizer(policy_class, policy):
    """Return the optimizer the policy configures for itself.

    Args:
        policy_class: One of ``'ACT'``, ``'HATACT'`` or ``'CNNMLP'``.
        policy: Object exposing ``configure_optimizers()``.

    Returns:
        Whatever ``policy.configure_optimizers()`` returns.

    Raises:
        NotImplementedError: For any other ``policy_class``.
    """
    # Every supported policy builds its optimizer the same way.
    if policy_class not in ('ACT', 'HATACT', 'CNNMLP'):
        raise NotImplementedError
    return policy.configure_optimizers()


def get_image(ts, camera_names):
    """Build the image input tensor for the policy from one environment timestep.

    Args:
        ts: Timestep whose ``observation['images']`` maps camera name to an
            ``h w c`` image array.
        camera_names: Cameras to include, in this order.

    Returns:
        Float CUDA tensor with a leading batch axis of size 1, followed by the
        camera axis and channel-first images, with values divided by 255.
    """
    images_by_cam = ts.observation['images']
    channel_first = [
        rearrange(images_by_cam[cam_name], 'h w c -> c h w')
        for cam_name in camera_names
    ]
    stacked = np.stack(channel_first, axis=0)
    scaled = torch.from_numpy(stacked / 255.0).float()
    return scaled.cuda().unsqueeze(0)


def eval_bc(config, ckpt_name, save_episode=True):
    """Roll out a saved policy for 10 episodes and report success statistics.

    Side effects: creates ``fig_slot/<rollout>/img`` and ``.../attn`` in the
    current working directory, writes ``result_<ckpt>.txt`` into ``ckpt_dir``
    and, when ``save_episode`` is set, saves a video of every rollout that did
    not reach the maximum reward.

    Args:
        config: Dict with keys ``ckpt_dir``, ``state_dim``, ``real_robot``,
            ``policy_class``, ``onscreen_render``, ``policy_config``,
            ``camera_names``, ``episode_len``, ``task_name`` and
            ``temporal_agg``.
        ckpt_name: Path of a checkpoint file holding a policy state dict.
        save_episode: Whether to save videos of non-successful rollouts.

    Returns:
        Tuple ``(success_rate, avg_return)`` over all rollouts.

    Raises:
        NotImplementedError: If the policy class is not ACT, HATACT or CNNMLP.
    """
    set_seed(1001)
    ckpt_dir = config['ckpt_dir']
    state_dim = config['state_dim']
    real_robot = config['real_robot']
    policy_class = config['policy_class']
    onscreen_render = config['onscreen_render']
    policy_config = config['policy_config']
    camera_names = config['camera_names']
    max_timesteps = config['episode_len']
    task_name = config['task_name']
    temporal_agg = config['temporal_agg']
    onscreen_cam = 'angle'

    # load policy and stats
    ckpt_path = ckpt_name
    policy = make_policy(policy_class, policy_config)
    loading_status = policy.load_state_dict(torch.load(ckpt_path))
    print(loading_status)
    policy.cuda()
    policy.eval()
    print(f'Loaded: {ckpt_path}')
    stats_path = os.path.join(ckpt_dir, 'dataset_stats.pkl')
    # NOTE: pickle is only safe on files produced by our own training run.
    with open(stats_path, 'rb') as f:
        stats = pickle.load(f)

    def pre_process(s_qpos):
        # Normalize joint positions with the training-set statistics.
        return (s_qpos - stats['qpos_mean']) / stats['qpos_std']

    def post_process(a):
        # Undo the action normalization applied during training.
        return a * stats['action_std'] + stats['action_mean']

    # load environment
    if real_robot:
        from robot_utils import move_grippers  # requires aloha
        from real_env import make_real_env  # requires aloha
        env = make_real_env(init_node=True)
        env_max_reward = 0
    else:
        from sim_env import make_sim_env
        env = make_sim_env(task_name)
        env_max_reward = env.task.max_reward

    # NOTE(review): both values are hard-coded to 50 instead of
    # policy_config['num_queries']; confirm this matches the trained chunk size.
    query_frequency = 50
    if temporal_agg:
        query_frequency = 1
        num_queries = 50

    max_timesteps = int(max_timesteps)  # may increase for real-world tasks

    num_rollouts = 10
    episode_returns = []
    highest_rewards = []
    for rollout_id in tqdm(range(num_rollouts)):
        # exist_ok so a partially created tree from an earlier run is reused.
        os.makedirs(f'fig_slot/{rollout_id}/img', exist_ok=True)
        os.makedirs(f'fig_slot/{rollout_id}/attn', exist_ok=True)

        ### set task
        if 'sim_transfer_cube' in task_name:
            BOX_POSE[0] = sample_box_pose()  # used in sim reset
        elif 'sim_insertion' in task_name:
            BOX_POSE[0] = np.concatenate(sample_insertion_pose())  # used in sim reset

        ts = env.reset()

        ### onscreen render
        if onscreen_render:
            fig, ax = plt.subplots()
            plt_img = ax.imshow(env._physics.render(height=480, width=640, camera_id=onscreen_cam))
            plt.ion()

        ### evaluation loop
        if temporal_agg:
            all_time_actions = torch.zeros([max_timesteps, max_timesteps + num_queries, state_dim]).cuda()

        qpos_history = torch.zeros((1, max_timesteps, state_dim)).cuda()
        image_list = []  # for visualization
        qpos_list = []
        target_qpos_list = []
        rewards = []
        cls_1s = []
        cls_2s = []

        with torch.inference_mode():
            for t in range(max_timesteps):
                ### update onscreen render and wait for DT
                if onscreen_render:
                    image = env._physics.render(height=480, width=640, camera_id=onscreen_cam)
                    plt_img.set_data(image)
                    plt.pause(DT)
                    fig.savefig(f'fig_slot/{rollout_id}/img/{t}.png')

                ### process previous timestep to get qpos and image_list
                obs = ts.observation
                if 'images' in obs:
                    image_list.append(obs['images'])
                else:
                    image_list.append({'main': obs['image']})
                qpos_numpy = np.array(obs['qpos'])
                qpos = pre_process(qpos_numpy)
                qpos = torch.from_numpy(qpos).float().cuda().unsqueeze(0)
                qpos_history[:, t] = qpos
                curr_image = get_image(ts, camera_names)

                ### query policy
                if policy_class in ('ACT', 'HATACT'):
                    if t % query_frequency == 0:
                        if policy_class == 'HATACT':
                            # HATACT additionally returns CLS attention of both arm decoders.
                            all_actions, cls_1, cls_2 = policy(qpos, curr_image)
                            cls_1s.append(cls_1)
                            cls_2s.append(cls_2)
                        else:
                            all_actions = policy(qpos, curr_image)
                    if temporal_agg:
                        # Blend every earlier prediction made for step t using
                        # exponentially decaying weights over prediction order.
                        all_time_actions[[t], t:t + num_queries] = all_actions
                        actions_for_curr_step = all_time_actions[:, t]
                        actions_populated = torch.all(actions_for_curr_step != 0, axis=1)
                        actions_for_curr_step = actions_for_curr_step[actions_populated]
                        k = 0.01
                        exp_weights = np.exp(-k * np.arange(len(actions_for_curr_step)))
                        exp_weights = exp_weights / exp_weights.sum()
                        exp_weights = torch.from_numpy(exp_weights).cuda().unsqueeze(dim=1)
                        raw_action = (actions_for_curr_step * exp_weights).sum(dim=0, keepdim=True)
                    else:
                        raw_action = all_actions[:, t % query_frequency]
                elif policy_class == 'CNNMLP':
                    raw_action = policy(qpos, curr_image)
                else:
                    raise NotImplementedError

                ### post-process actions
                raw_action = raw_action.squeeze(0).cpu().numpy()
                action = post_process(raw_action)
                target_qpos = action

                ### step the environment
                ts = env.step(target_qpos)

                ### for visualization
                qpos_list.append(qpos_numpy)
                target_qpos_list.append(target_qpos)
                rewards.append(ts.reward)

                # Only HATACT records CLS attention; skip the plot otherwise.
                if cls_1s:
                    plot_cls_attention(cls_1s, cls_2s, t, f'fig_slot/{rollout_id}/attn')

            # ``fig`` only exists when onscreen rendering created it.
            if onscreen_render:
                plt.close(fig)

        if real_robot:
            move_grippers([env.puppet_bot_left, env.puppet_bot_right], [PUPPET_GRIPPER_JOINT_OPEN] * 2, move_time=0.5)  # open

        rewards = np.array(rewards)
        # Element-wise comparison on purpose: drops None entries from the sum.
        episode_return = np.sum(rewards[rewards != None])  # noqa: E711
        episode_returns.append(episode_return)
        episode_highest_reward = np.max(rewards)
        highest_rewards.append(episode_highest_reward)
        tqdm.write(f'Rollout {rollout_id:2d}: Reward: {episode_highest_reward}/{env_max_reward}, Success: {episode_highest_reward==env_max_reward}, Success rate: {np.mean(np.array(highest_rewards) == env_max_reward):.2f}')

        # Videos are kept only for rollouts that did not reach the max reward.
        if save_episode and episode_highest_reward != env_max_reward:
            save_videos(image_list, DT, video_path=os.path.join(ckpt_dir, f'videos/video{rollout_id}.mp4'))

    success_rate = np.mean(np.array(highest_rewards) == env_max_reward)
    avg_return = np.mean(episode_returns)
    summary_str = f'\nSuccess rate: {success_rate}\nAverage return: {avg_return}\n\n'
    for r in range(env_max_reward + 1):
        more_or_equal_r = (np.array(highest_rewards) >= r).sum()
        more_or_equal_r_rate = more_or_equal_r / num_rollouts
        summary_str += f'Reward >= {r}: {more_or_equal_r}/{num_rollouts} = {more_or_equal_r_rate*100}%\n'

    print(summary_str)

    # save success rate to txt
    ckpt_name = ckpt_name.split('/')[-1]
    result_file_name = 'result_' + ckpt_name.split('.')[0] + '.txt'
    with open(os.path.join(ckpt_dir, result_file_name), 'w') as f:
        f.write(summary_str)
        f.write(repr(episode_returns))
        f.write('\n\n')
        f.write(repr(highest_rewards))

    return success_rate, avg_return


def forward_pass(data, policy, loss_type):
    """Move one batch to the GPU and run the policy on it.

    Args:
        data: Batch of four tensors ``(image, qpos, action, is_pad)``.
        policy: Callable invoked as ``policy(qpos, image, action, is_pad, loss_type)``.
        loss_type: Forwarded unchanged to the policy.

    Returns:
        Whatever the policy returns for this batch.
    """
    images, qpos, actions, pad_mask = data
    images = images.cuda()
    qpos = qpos.cuda()
    actions = actions.cuda()
    pad_mask = pad_mask.cuda()
    # Note the argument order: the policy takes qpos before the images.
    return policy(qpos, images, actions, pad_mask, loss_type)

def update_top_models(top_models, new_model, max_models=6):
    """Insert ``new_model`` into the running list of lowest-loss models, in place.

    Args:
        top_models: List of entries whose element at index 1 is the loss;
            modified in place and left sorted by ascending loss.
        new_model: Candidate entry with the same layout.
        max_models: Capacity of the list before entries start being replaced.

    Returns:
        None.
    """
    if len(top_models) >= max_models:
        # List is full: the candidate only gets in by strictly beating the
        # current worst entry (the first one in case of ties).
        worst_idx, worst_entry = max(enumerate(top_models), key=lambda pair: pair[1][1])
        if worst_entry[1] > new_model[1]:
            top_models[worst_idx] = new_model
    else:
        top_models.append(new_model)
    top_models.sort(key=lambda entry: entry[1])

def save_model_checkpoints(top_models, ckpt_dir, seed):
    """Write the state dict of every entry in ``top_models`` to disk.

    Args:
        top_models: Iterable of ``(epoch, val_loss, state_dict)`` triples.
        ckpt_dir: Run directory; files go to its ``weights/best`` subfolder,
            which must already exist.
        seed: Seed value embedded in each file name.
    """
    for epoch_idx, _val_loss, state_dict in top_models:
        target = os.path.join(ckpt_dir, f'weights/best/policy_epoch_{epoch_idx}_seed_{seed}.ckpt')
        torch.save(state_dict, target)

def train_bc(train_dataloader, dataset_dir, num_episodes, camera_names, batch_size_val, config, args):
    """Train a policy by behavior cloning and save the best checkpoints.

    Each epoch runs validation first, then one pass over ``train_dataloader``.
    During the last 20% of epochs the model is offered to the list of
    lowest-validation-loss models; those are written to
    ``<ckpt_dir>/weights/best`` at the end. A resumable checkpoint (model,
    optimizer, histories, top models, wandb run id) is written whenever
    ``epoch % 1000 == 998`` and as ``weights/policy_last.ckpt`` at the end.

    Args:
        train_dataloader: Iterable of training batches accepted by ``forward_pass``.
        dataset_dir: Dataset root; validation data is loaded from ``<dataset_dir>/val``.
        num_episodes: Unused here; kept for interface compatibility.
        camera_names: Forwarded to ``load_val_data``.
        batch_size_val: Batch size forwarded to ``load_val_data``.
        config: Dict with ``num_epochs``, ``ckpt_dir``, ``seed``,
            ``policy_class`` and ``policy_config``.
        args: Run settings; reads ``loss``, ``wandb`` and optionally
            ``resume`` / ``resume_ckpt_path``.

    Returns:
        List of paths of the saved best-model checkpoints.
    """
    num_epochs = config['num_epochs']
    ckpt_dir = config['ckpt_dir']
    seed = config['seed']
    policy_class = config['policy_class']
    policy_config = config['policy_config']
    loss_type = args['loss']

    set_seed(seed)

    policy = make_policy(policy_class, policy_config)
    policy.cuda()
    optimizer = make_optimizer(policy_class, policy)

    train_history = []
    validation_history = []
    top_models = []
    start_epoch = 0

    def save_training_state(path, epoch):
        # Everything needed to continue this run from ``epoch + 1``.
        torch.save({
            'epoch': epoch,
            'model_state': policy.state_dict(),
            'optimizer_state': optimizer.state_dict(),
            'train_history': train_history,
            'validation_history': validation_history,
            'top_models': top_models,
            'run_id': wandb.run.id if args['wandb'] else 'None'
        }, path)

    resume = args.get('resume', False)
    resume_ckpt_path = args.get('resume_ckpt_path', None)
    resumed = False
    if resume and resume_ckpt_path is not None:
        if os.path.isfile(resume_ckpt_path):
            checkpoint = torch.load(resume_ckpt_path)
            policy.load_state_dict(checkpoint['model_state'])
            optimizer.load_state_dict(checkpoint['optimizer_state'])
            start_epoch = checkpoint['epoch'] + 1
            train_history = checkpoint.get('train_history', [])
            validation_history = checkpoint.get('validation_history', [])
            top_models = checkpoint.get('top_models', [])
            run_id = checkpoint.get('run_id', None)
            resumed = True
            print(f"Resumed training from checkpoint {resume_ckpt_path} at epoch {start_epoch}")
            print(f'Start logging to wandb id: {run_id}')
        else:
            print(f'Warning: resume checkpoint {resume_ckpt_path} not found; training from scratch')

    if not resumed:
        # Fresh run: start a new per-epoch log with its header line.
        with open(os.path.join(ckpt_dir, 'epoch_history.txt'), 'w') as f:
            f.write('epoch\t\ttrain loss\t\tval loss\t\tl1 loss\n')

    print_args = ' '.join(sys.argv)
    print_args += '\n\n'
    for key, value in args.items():
        print_args += f"{key}: {value}\n"
    print_args += f"\nTraining Start : {datetime.now()}\n"
    with open(os.path.join(ckpt_dir, 'train_info.txt'), 'w') as f:
        f.write(print_args)

    # Start at the epoch after the resumed checkpoint (0 for a fresh run).
    for epoch in tqdm(range(start_epoch, num_epochs)):
        # validation
        with torch.inference_mode():
            policy.eval()
            val_dataloader, _, _ = load_val_data(dataset_dir + '/val', 10, camera_names, batch_size_val)
            epoch_dicts = [forward_pass(data, policy, loss_type) for data in val_dataloader]
            epoch_summary = compute_dict_mean(epoch_dicts)
            validation_history.append(epoch_summary)
            epoch_val_loss = epoch_summary['loss']
            epoch_val_l1_loss = epoch_summary['l1']

            # Only the last 20% of epochs compete for the best-model slots, so
            # the weights are copied only then.
            if epoch >= num_epochs * 0.8:
                new_model_info = (epoch, epoch_val_loss, deepcopy(policy.state_dict()))
                update_top_models(top_models, new_model_info)

        # training
        policy.train()
        optimizer.zero_grad()
        epoch_first_batch = len(train_history)
        for data in train_dataloader:
            forward_dict = forward_pass(data, policy, loss_type)
            # backward
            loss = forward_dict['loss']
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            train_history.append(detach_dict(forward_dict))
        # Average over exactly the batches appended during this epoch.
        epoch_summary = compute_dict_mean(train_history[epoch_first_batch:])
        epoch_train_loss = epoch_summary['loss']
        epoch_l1_loss = epoch_summary['l1']
        if args['wandb']:
            wandb.log({'epoch': epoch, "Val Loss": epoch_val_loss, "Val L1 Loss": epoch_val_l1_loss,  "Train Loss": epoch_train_loss, "L1 Loss": epoch_l1_loss})
        # Console and text log report the epoch-mean L1, the same value wandb gets.
        tqdm.write(f'Epoch: {epoch:4d}\tVal loss: {epoch_val_loss:.5f}\tTrain loss: {epoch_train_loss:.5f}\tL1 loss: {epoch_l1_loss:.5f}')
        with open(os.path.join(ckpt_dir, 'epoch_history.txt'), 'a') as f:
            f.write(f'{epoch:4d}\t\t{epoch_train_loss:.5f}\t\t{epoch_val_loss:.5f}\t\t{epoch_l1_loss:.5f}\n')

        if epoch % 100 == 0:
            plot_history(train_history, validation_history, epoch, ckpt_dir, seed)
        if epoch % 1000 == 998:
            ckpt_path = os.path.join(ckpt_dir, f'weights/policy_epoch_{epoch}_seed_{seed}.ckpt')
            save_training_state(ckpt_path, epoch)
            tqdm.write(f'Saved checkpoint to {ckpt_path}')

    ckpt_path = os.path.join(ckpt_dir, 'weights/policy_last.ckpt')
    save_training_state(ckpt_path, num_epochs - 1)

    print('Training finished:\n')
    result_summary = ''
    ckpt_paths = []
    for best_epoch, best_val_loss, best_state_dict in top_models:
        ckpt_path = os.path.join(ckpt_dir, f'weights/best/policy_epoch_{best_epoch}_seed_{seed}.ckpt')
        ckpt_paths.append(ckpt_path)
        torch.save(best_state_dict, ckpt_path)
        result_summary += f'Seed {seed}, val loss {best_val_loss:.6f} at epoch {best_epoch}\n'
    print(result_summary)
    with open(os.path.join(ckpt_dir, 'train_info.txt'), 'a') as f:
        f.write(f"Training End : {datetime.now()}\n")
        f.write(f'Training Result:\n{result_summary}')

    # save training curves
    plot_history(train_history, validation_history, num_epochs, ckpt_dir, seed)

    return ckpt_paths


def plot_history(train_history, validation_history, num_epochs, ckpt_dir, seed):
    """Save one train-vs-validation curve per logged metric.

    Args:
        train_history: List of metric dicts, one per training batch; values
            must support ``.item()``.
        validation_history: List of metric dicts, one per validation pass,
            containing the same keys.
        num_epochs: Both curves are spread evenly over ``[0, num_epochs - 1]``.
        ckpt_dir: Run directory; figures are written to its ``plots`` subfolder.
        seed: Seed value embedded in each file name.

    Returns:
        None. Nothing is written when ``train_history`` is empty.
    """
    # No batches recorded yet: there are no metric keys to plot.
    if not train_history:
        return

    for key in train_history[0]:
        plot_path = os.path.join(ckpt_dir, f'plots/train_val_{key}_seed_{seed}.png')
        fig = plt.figure()
        train_values = [summary[key].item() for summary in train_history]
        val_values = [summary[key].item() for summary in validation_history]
        plt.plot(np.linspace(0, num_epochs-1, len(train_history)), train_values, label='train')
        plt.plot(np.linspace(0, num_epochs-1, len(validation_history)), val_values, label='validation')
        plt.legend()
        plt.title(key)
        plt.savefig(plot_path)
        # This runs repeatedly during training; close the figure so they do
        # not pile up in memory.
        plt.close(fig)
    tqdm.write(f'Saved plots to {ckpt_dir}')


if __name__ == '__main__':
    # Usage: python <this script> <config.py>
    #
    # The values below are defaults. The config file is plain Python that is
    # executed in this namespace and may reassign any of them, so the names
    # must stay exactly as they are (including ``eval``, which shadows the
    # builtin, and ``wandb_``, which avoids clobbering the wandb module).
    project = False
    name = ''
    eval = False
    resume = False
    resume_ckpt_path = None
    train_eval = False
    wandb_ = False
    onscreen_render = False
    save_video = False
    ckpt_dir = 'runs/test'
    policy_class = 'HATACT'
    task_name = 'test'
    batch_size = 8
    seed = 0
    num_epochs = 5000
    loss = 'l1'
    lr = 1e-4
    dual_latent = False
    single_latent = False
    enc_layers = 4
    dec_layers = 7
    num_blocks = 1
    kl_weight = 1
    chunk_size = 50
    hidden_dim = 512
    dim_feedforward = 2048
    temporal_agg = False
    notes = ''

    # A missing argument must end in a usage message, not an IndexError.
    if len(sys.argv) < 2 or not sys.argv[1]:
        print(f'Usage: python {sys.argv[0]} <config.py>')
        sys.exit(1)

    # SECURITY: exec runs arbitrary code from the config file; only pass
    # config files you trust.
    with open(sys.argv[1]) as config_file:
        exec(config_file.read())
    print(f'Overriding config with {sys.argv[1]}')

    args = {
        'project': project,
        'name': name,
        'eval': eval,
        'resume': resume,
        'resume_ckpt_path': resume_ckpt_path,
        'train_eval': train_eval,
        'wandb': wandb_,
        'onscreen_render': onscreen_render,
        'save_video': save_video,
        'ckpt_dir': ckpt_dir,
        'policy_class': policy_class,
        'task_name': task_name,
        'batch_size': batch_size,
        'seed': seed,
        'num_epochs': num_epochs,
        'loss': loss,
        'lr': lr,
        'dual_latent': dual_latent,
        'single_latent': single_latent,
        'enc_layers': enc_layers,
        'dec_layers': dec_layers,
        'num_blocks': num_blocks,
        'kl_weight': kl_weight,
        'chunk_size': chunk_size,
        'hidden_dim': hidden_dim,
        'dim_feedforward': dim_feedforward,
        'temporal_agg': temporal_agg,
        'notes': notes,
    }

    main(args)